import pandas as pd
import os
import math
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from autogluon.tabular import TabularDataset, TabularPredictor
df = pd.read_csv("~/Desktop/fraudTrain.csv")
df = pd.concat([df[df["is_fraud"] == 0].sample(frac=0.20, random_state=42),
                df[df["is_fraud"] == 1]])   # keep 20% of normal transactions, all fraud cases
df.head()
Unnamed: 0 | trans_date_trans_time | cc_num | merchant | category | amt | first | last | gender | street | ... | lat | long | city_pop | job | dob | trans_num | unix_time | merch_lat | merch_long | is_fraud | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
669418 | 669418 | 2019-10-12 18:21 | 4.089100e+18 | fraud_Haley, Jewess and Bechtelar | shopping_pos | 7.53 | Debra | Stark | F | 686 Linda Rest | ... | 32.3836 | -94.8653 | 24536 | Multimedia programmer | 1983-10-14 | d313353fa30233e5fab5468e852d22fc | 1350066071 | 32.202008 | -94.371865 | 0 |
32567 | 32567 | 2019-01-20 13:06 | 4.247920e+12 | fraud_Turner LLC | travel | 3.79 | Judith | Moss | F | 46297 Benjamin Plains Suite 703 | ... | 39.5370 | -83.4550 | 22305 | Television floor manager | 1939-03-09 | 88c65b4e1585934d578511e627fe3589 | 1327064760 | 39.156673 | -82.930503 | 0 |
156587 | 156587 | 2019-03-24 18:09 | 4.026220e+12 | fraud_Klein Group | entertainment | 59.07 | Debbie | Payne | F | 204 Ashley Neck Apt. 169 | ... | 41.5224 | -71.9934 | 4720 | Broadcast presenter | 1977-05-18 | 3bd9ede04b5c093143d5e5292940b670 | 1332612553 | 41.657152 | -72.595751 | 0 |
1020243 | 1020243 | 2020-02-25 15:12 | 4.957920e+12 | fraud_Monahan-Morar | personal_care | 25.58 | Alan | Parsons | M | 0547 Russell Ford Suite 574 | ... | 39.6171 | -102.4776 | 207 | Network engineer | 1955-12-04 | 19e16ee7a01d229e750359098365e321 | 1361805120 | 39.080346 | -103.213452 | 0 |
116272 | 116272 | 2019-03-06 23:19 | 4.178100e+15 | fraud_Kozey-Kuhlman | personal_care | 84.96 | Jill | Flores | F | 639 Cruz Islands | ... | 41.9488 | -86.4913 | 3104 | Horticulturist, commercial | 1981-03-29 | a0c8641ca1f5d6e243ed5a2246e66176 | 1331075954 | 42.502065 | -86.732664 | 0 |
5 rows × 23 columns
- Of the 265,342 sampled transactions, 7,506 (2.83%) are fraudulent.
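These figures can be reproduced directly from the sampled frame (a quick check, assuming `df` is the frame built above):

print(len(df))                                # total transactions
print(df["is_fraud"].sum())                   # fraudulent transactions
print(round(df["is_fraud"].mean() * 100, 2))  # fraud rate in percent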
Bipartite graph
def build_graph_bipartite(df_input, graph_type=nx.Graph()):
    df = df_input.copy()
    # Map every card number and merchant name to an integer node id
    mapping = {x: node_id for node_id, x in enumerate(set(df["cc_num"].values.tolist() +
                                                          df["merchant"].values.tolist()))}
    df["from"] = df["cc_num"].apply(lambda x: mapping[x])   # edge source (card)
    df["to"] = df["merchant"].apply(lambda x: mapping[x])   # edge target (merchant)
    # Aggregate all transactions between the same card and merchant into one edge
    df = df[["from", "to", "amt", "is_fraud"]].groupby(["from", "to"]).agg(
        {"is_fraud": "sum", "amt": "sum"}).reset_index()
    df["is_fraud"] = df["is_fraud"].apply(lambda x: 1 if x > 0 else 0)
    G = nx.from_edgelist(df[["from", "to"]].values, create_using=graph_type)
    # Edge attribute: fraud flag of the (card, merchant) pair
    nx.set_edge_attributes(G, {(int(x["from"]), int(x["to"])): x["is_fraud"]
                               for idx, x in df[["from", "to", "is_fraud"]].iterrows()}, "label")
    # Edge attribute: total transaction amount of the pair
    nx.set_edge_attributes(G, {(int(x["from"]), int(x["to"])): x["amt"]
                               for idx, x in df[["from", "to", "amt"]].iterrows()}, "weight")
    return G

G_bu = build_graph_bipartite(df, nx.Graph(name="Bipartite Undirect"))
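Since edges only ever run from a card node to a merchant node, the result should be bipartite. A small sanity check (not in the original; assumes card numbers and merchant names never collide as mapping keys):

print(nx.is_bipartite(G_bu))                           # expected: True
print(G_bu.number_of_nodes(), G_bu.number_of_edges())  # one node per card/merchant, one edge per pair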
Tripartite graph
def build_graph_tripartite(df_input, graph_type=nx.Graph()):
    df = df_input.copy()
    # Node ids cover transactions (the frame index), cards, and merchants
    mapping = {x: node_id for node_id, x in enumerate(set(df.index.values.tolist() +
                                                          df["cc_num"].values.tolist() +
                                                          df["merchant"].values.tolist()))}
    df["in_node"] = df["cc_num"].apply(lambda x: mapping[x])
    df["out_node"] = df["merchant"].apply(lambda x: mapping[x])
    # Each transaction becomes its own node, linked to both its card and its merchant
    G = nx.from_edgelist([(x["in_node"], mapping[idx]) for idx, x in df.iterrows()] +
                         [(x["out_node"], mapping[idx]) for idx, x in df.iterrows()],
                         create_using=graph_type)
    nx.set_edge_attributes(G, {(x["in_node"], mapping[idx]): x["is_fraud"] for idx, x in df.iterrows()}, "label")
    nx.set_edge_attributes(G, {(x["out_node"], mapping[idx]): x["is_fraud"] for idx, x in df.iterrows()}, "label")
    nx.set_edge_attributes(G, {(x["in_node"], mapping[idx]): x["amt"] for idx, x in df.iterrows()}, "weight")
    nx.set_edge_attributes(G, {(x["out_node"], mapping[idx]): x["amt"] for idx, x in df.iterrows()}, "weight")
    return G

G_tu = build_graph_tripartite(df, nx.Graph())
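With a transaction node sitting between its card and its merchant, the counts should satisfy |V| = #transactions + #cards + #merchants and |E| = 2 × #transactions. A hedged check (assumes transaction indices, card numbers, and merchant names never collide as mapping keys):

print(G_tu.number_of_nodes(), len(df) + df["cc_num"].nunique() + df["merchant"].nunique())
print(G_tu.number_of_edges(), 2 * len(df))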
Supervised learning (bipartite graph)
from sklearn.utils import resample

df_majority = df[df.is_fraud == 0]
df_minority = df[df.is_fraud == 1]

# Downsample the majority (normal) class to the size of the minority (fraud) class
df_maj_downsampled = resample(df_majority,
                              n_samples=len(df_minority),
                              random_state=42)

df_downsampled = pd.concat([df_minority, df_maj_downsampled])

print(df_downsampled.is_fraud.value_counts())

G_down = build_graph_bipartite(df_downsampled)
1 6006
0 6006
Name: is_fraud, dtype: int64
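For reference, the same balanced downsample can be written as a single pandas groupby (a sketch, assuming pandas >= 1.1 for GroupBy.sample; `df_downsampled_alt` is a hypothetical name):

df_downsampled_alt = df.groupby("is_fraud").sample(n=int(df.is_fraud.sum()), random_state=42)
print(df_downsampled_alt.is_fraud.value_counts())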
from sklearn.model_selection import train_test_split

train_edges, test_edges, train_labels, test_labels = train_test_split(
    list(range(len(G_down.edges))),
    list(nx.get_edge_attributes(G_down, "label").values()),
    test_size=0.20,
    random_state=42)
edgs = list(G_down.edges)
# Keep only the training edges, then add the now-isolated nodes back so that
# every node of G_down still exists in the training graph
train_graph = G_down.edge_subgraph([edgs[x] for x in train_edges]).copy()
train_graph.add_nodes_from(list(set(G_down.nodes) - set(train_graph.nodes)))
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder

node2vec_train = Node2Vec(train_graph, weight_key='weight')
model_train = node2vec_train.fit(window=10)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:03<00:00, 2.57it/s]
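Only weight_key and window are set explicitly, so everything else falls back to the node2vec package defaults; judging from the log (10 walks) and the embedding shape below (128 dimensions), those defaults are dimensions=128, walk_length=80, num_walks=10. An explicit, equivalent call might look like this (parameter values are an assumption about the installed version):

node2vec_train = Node2Vec(train_graph,
                          dimensions=128,       # size of each node vector
                          walk_length=80,       # steps per random walk
                          num_walks=10,         # walks started from every node
                          weight_key='weight')  # use transaction amounts as edge weights
model_train = node2vec_train.fit(window=10)     # gensim Word2Vec context window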
classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
for cl in classes:
    embeddings_train = cl(keyed_vectors=model_train.wv)
# Note: the loop only rebinds embeddings_train, so the features used below
# come from the last class, WeightedL2Embedder

train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]
np.array(train_embeddings).shape
(9351, 128)
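9,351 is the 80% training share of the 11,689 aggregated card–merchant edges of G_down (the remaining 2,338 form the test set below), each represented by a 128-dimensional edge vector:

len(G_down.edges), len(train_edges), len(test_edges)  # expected: (11689, 9351, 2338)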
np.array(train_embeddings)
array([[4.0369573e-01, 2.0337313e-01, 2.1946652e-01, ..., 1.7150687e-01,
3.6633116e-01, 3.2048109e-01],
[6.0373070e-03, 1.6968289e-01, 8.6983815e-03, ..., 2.2079267e-01,
3.2768153e-02, 2.3883855e-02],
[9.2083057e-03, 1.8300842e-02, 8.2615782e-03, ..., 4.4274908e-02,
2.1799646e-01, 2.3926771e-03],
...,
[1.7281795e-01, 4.6769153e-02, 1.9730711e-01, ..., 6.4412162e-02,
3.3814883e-01, 2.4217861e-02],
[6.3609913e-02, 2.2192889e-01, 1.1514757e-04, ..., 9.1643520e-02,
1.5498386e-02, 2.4329810e-01],
[2.6236567e-05, 4.7491617e-03, 9.5967706e-03, ..., 1.5650114e-01,
1.1875462e-02, 9.1554008e-02]], dtype=float32)
np.array(train_labels).shape
(9351,)
# Build the feature DataFrame
columns = [f'embedding_{i}' for i in range(np.array(train_embeddings).shape[1])]
df_data = pd.DataFrame(data=train_embeddings, columns=columns)

df_labels = pd.DataFrame(data=train_labels, columns=['label'])

# Concatenate features and labels (note: this overwrites the transaction frame df)
df = pd.concat([df_data, df_labels], axis=1)
df
embedding_0 | embedding_1 | embedding_2 | embedding_3 | embedding_4 | embedding_5 | embedding_6 | embedding_7 | embedding_8 | embedding_9 | ... | embedding_119 | embedding_120 | embedding_121 | embedding_122 | embedding_123 | embedding_124 | embedding_125 | embedding_126 | embedding_127 | label | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.403696 | 0.203373 | 0.219467 | 0.046628 | 0.056515 | 0.018408 | 0.022522 | 0.080388 | 0.098083 | 0.005314 | ... | 0.136802 | 0.000194 | 0.063601 | 0.175472 | 0.044392 | 0.010460 | 0.171507 | 0.366331 | 0.320481 | 0 |
1 | 0.006037 | 0.169683 | 0.008698 | 0.141072 | 0.029439 | 0.097091 | 0.005095 | 0.080614 | 0.132287 | 0.017439 | ... | 0.050932 | 0.008416 | 0.004334 | 0.000474 | 0.000034 | 0.075918 | 0.220793 | 0.032768 | 0.023884 | 1 |
2 | 0.009208 | 0.018301 | 0.008262 | 0.025849 | 0.031677 | 0.000057 | 0.147312 | 0.136967 | 0.002352 | 0.057455 | ... | 0.008357 | 0.055791 | 0.109624 | 0.000029 | 0.007875 | 0.005629 | 0.044275 | 0.217996 | 0.002393 | 1 |
3 | 0.129434 | 0.036309 | 0.040281 | 0.056018 | 0.138173 | 0.063305 | 0.023791 | 0.021431 | 0.001766 | 0.000098 | ... | 0.008713 | 0.050279 | 0.028918 | 0.102740 | 0.002691 | 0.000420 | 0.215788 | 0.226286 | 0.014054 | 1 |
4 | 0.055134 | 0.000257 | 0.027203 | 0.406045 | 0.367124 | 0.009524 | 0.000950 | 0.040553 | 0.075501 | 0.123167 | ... | 0.003346 | 0.004510 | 0.057712 | 0.000007 | 0.181280 | 0.009843 | 0.061533 | 0.023981 | 0.006037 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
9346 | 0.041404 | 0.148350 | 0.128774 | 0.000489 | 0.034969 | 0.010100 | 0.069104 | 0.044493 | 0.130157 | 0.001190 | ... | 0.510222 | 0.061467 | 0.052151 | 0.142345 | 0.010240 | 0.528946 | 0.089659 | 0.026390 | 0.088123 | 1 |
9347 | 0.007367 | 0.250150 | 0.424588 | 0.140796 | 0.002336 | 0.052449 | 0.044327 | 0.131472 | 0.035484 | 0.123832 | ... | 0.003025 | 0.015720 | 0.167906 | 0.013722 | 0.051255 | 0.302898 | 0.042692 | 0.184819 | 0.039848 | 0 |
9348 | 0.172818 | 0.046769 | 0.197307 | 0.000106 | 0.001071 | 0.024714 | 0.076101 | 0.011439 | 0.326547 | 0.034753 | ... | 0.087797 | 0.325244 | 0.009078 | 0.566709 | 0.025226 | 0.452541 | 0.064412 | 0.338149 | 0.024218 | 0 |
9349 | 0.063610 | 0.221929 | 0.000115 | 0.025852 | 0.000718 | 0.126464 | 0.029603 | 0.000026 | 0.003015 | 0.080191 | ... | 0.000170 | 0.095050 | 0.029800 | 0.004892 | 0.026643 | 0.049939 | 0.091644 | 0.015498 | 0.243298 | 1 |
9350 | 0.000026 | 0.004749 | 0.009597 | 0.075472 | 0.000965 | 0.038021 | 0.000005 | 0.128029 | 0.041665 | 0.019990 | ... | 0.263180 | 0.000784 | 0.140076 | 0.111306 | 0.001472 | 0.000657 | 0.156501 | 0.011875 | 0.091554 | 1 |
9351 rows × 129 columns
label = np.array(train_labels)
predictr = TabularPredictor(label='label')
No path specified. Models will be saved in: "AutogluonModels/ag-20240121_081618/"
predictr.fit(df)
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240121_081618/"
AutoGluon Version: 0.8.2
Python Version: 3.8.18
Operating System: Linux
Platform Machine: x86_64
Platform Version: #38~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Nov 2 18:01:13 UTC 2
Disk Space Avail: 628.19 GB / 982.82 GB (63.9%)
Train Data Rows: 9351
Train Data Columns: 128
Label Column: label
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
2 unique label values: [0, 1]
If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping: class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
Available Memory: 39623.22 MB
Train Data (Original) Memory Usage: 4.79 MB (0.0% of available memory)
Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
Stage 1 Generators:
Fitting AsTypeFeatureGenerator...
Stage 2 Generators:
Fitting FillNaFeatureGenerator...
Stage 3 Generators:
Fitting IdentityFeatureGenerator...
Stage 4 Generators:
Fitting DropUniqueFeatureGenerator...
Stage 5 Generators:
Fitting DropDuplicatesFeatureGenerator...
Types of features in original data (raw dtype, special dtypes):
('float', []) : 128 | ['embedding_0', 'embedding_1', 'embedding_2', 'embedding_3', 'embedding_4', ...]
Types of features in processed data (raw dtype, special dtypes):
('float', []) : 128 | ['embedding_0', 'embedding_1', 'embedding_2', 'embedding_3', 'embedding_4', ...]
0.2s = Fit runtime
128 features in original data used to generate 128 features in processed data.
Train Data (Processed) Memory Usage: 4.79 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.24s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.1, Train Rows: 8415, Val Rows: 936
User-specified model hyperparameters to be fit:
{
'NN_TORCH': {},
'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
'CAT': {},
'XGB': {},
'FASTAI': {},
'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
}
Fitting 13 L1 models ...
Fitting model: KNeighborsUnif ...
0.4765 = Validation score (accuracy)
0.86s = Training runtime
0.08s = Validation runtime
Fitting model: KNeighborsDist ...
0.4765 = Validation score (accuracy)
0.86s = Training runtime
0.02s = Validation runtime
Fitting model: LightGBMXT ...
0.7147 = Validation score (accuracy)
1.73s = Training runtime
0.0s = Validation runtime
Fitting model: LightGBM ...
0.7137 = Validation score (accuracy)
1.6s = Training runtime
0.0s = Validation runtime
Fitting model: RandomForestGini ...
0.7286 = Validation score (accuracy)
2.58s = Training runtime
0.03s = Validation runtime
Fitting model: RandomForestEntr ...
0.7436 = Validation score (accuracy)
3.05s = Training runtime
0.03s = Validation runtime
Fitting model: CatBoost ...
0.7265 = Validation score (accuracy)
4.75s = Training runtime
0.0s = Validation runtime
Fitting model: ExtraTreesGini ...
0.7276 = Validation score (accuracy)
1.22s = Training runtime
0.03s = Validation runtime
Fitting model: ExtraTreesEntr ...
0.7361 = Validation score (accuracy)
1.24s = Training runtime
0.03s = Validation runtime
Fitting model: NeuralNetFastAI ...
No improvement since epoch 5: early stopping
0.7329 = Validation score (accuracy)
5.93s = Training runtime
0.01s = Validation runtime
Fitting model: XGBoost ...
0.7009 = Validation score (accuracy)
1.69s = Training runtime
0.01s = Validation runtime
Fitting model: NeuralNetTorch ...
0.7254 = Validation score (accuracy)
3.74s = Training runtime
0.06s = Validation runtime
Fitting model: LightGBMLarge ...
0.7179 = Validation score (accuracy)
4.37s = Training runtime
0.01s = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
0.7479 = Validation score (accuracy)
0.51s = Training runtime
0.0s = Validation runtime
AutoGluon training complete, total runtime = 35.14s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20240121_081618/")
<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7f9e5e0fffa0>
predictr.leaderboard()
model | score_val | pred_time_val | fit_time | pred_time_val_marginal | fit_time_marginal | stack_level | can_infer | fit_order | |
---|---|---|---|---|---|---|---|---|---|
0 | WeightedEnsemble_L2 | 0.747863 | 0.117266 | 18.046104 | 0.001305 | 0.510860 | 2 | True | 14 |
1 | RandomForestEntr | 0.743590 | 0.033528 | 3.046299 | 0.033528 | 3.046299 | 1 | True | 6 |
2 | ExtraTreesEntr | 0.736111 | 0.033238 | 1.237712 | 0.033238 | 1.237712 | 1 | True | 9 |
3 | NeuralNetFastAI | 0.732906 | 0.012168 | 5.929674 | 0.012168 | 5.929674 | 1 | True | 10 |
4 | RandomForestGini | 0.728632 | 0.033677 | 2.584781 | 0.033677 | 2.584781 | 1 | True | 5 |
5 | ExtraTreesGini | 0.727564 | 0.032846 | 1.223976 | 0.032846 | 1.223976 | 1 | True | 8 |
6 | CatBoost | 0.726496 | 0.003742 | 4.750515 | 0.003742 | 4.750515 | 1 | True | 7 |
7 | NeuralNetTorch | 0.725427 | 0.058065 | 3.735725 | 0.058065 | 3.735725 | 1 | True | 12 |
8 | LightGBMLarge | 0.717949 | 0.007159 | 4.371737 | 0.007159 | 4.371737 | 1 | True | 13 |
9 | LightGBMXT | 0.714744 | 0.004040 | 1.732927 | 0.004040 | 1.732927 | 1 | True | 3 |
10 | LightGBM | 0.713675 | 0.002815 | 1.598863 | 0.002815 | 1.598863 | 1 | True | 4 |
11 | XGBoost | 0.700855 | 0.005022 | 1.693087 | 0.005022 | 1.693087 | 1 | True | 11 |
12 | KNeighborsDist | 0.476496 | 0.022600 | 0.860941 | 0.022600 | 0.860941 | 1 | True | 2 |
13 | KNeighborsUnif | 0.476496 | 0.084483 | 0.864856 | 0.084483 | 0.864856 | 1 | True | 1 |
test = np.array(test_embeddings)
test.shape
(2338, 128)
columns = [f'embedding_{i}' for i in range(test.shape[1])]

# Build the test DataFrame
test_df = pd.DataFrame(data=test, columns=columns)

# Inspect it
print(test_df.head())
embedding_0 embedding_1 embedding_2 embedding_3 embedding_4 \
0 0.013685 0.001803 0.025815 0.352866 0.377978
1 0.002213 0.476018 0.174582 0.164626 0.026744
2 0.033581 0.123362 0.004575 0.037874 0.006698
3 0.022749 0.064797 0.000525 0.218562 0.002898
4 0.000060 0.007555 0.001758 0.257433 0.096365
embedding_5 embedding_6 embedding_7 embedding_8 embedding_9 ... \
0 0.007875 0.041015 0.035619 0.091952 0.395759 ...
1 0.013961 0.220435 0.512335 0.052540 0.000499 ...
2 0.050513 0.080717 0.059508 0.039344 0.380898 ...
3 0.006643 0.136263 0.028495 0.027063 0.027161 ...
4 0.075485 0.234585 0.037758 0.006236 0.000702 ...
embedding_118 embedding_119 embedding_120 embedding_121 embedding_122 \
0 0.017987 0.317063 0.191938 0.130174 0.159880
1 0.000210 0.163785 0.001970 0.013025 0.090202
2 0.176072 0.031670 0.134932 0.022356 0.065149
3 0.000395 0.022916 0.014586 0.101680 0.084287
4 0.036900 0.047368 0.093696 0.014713 0.091163
embedding_123 embedding_124 embedding_125 embedding_126 embedding_127
0 1.074022 0.009432 0.037776 0.009084 0.306696
1 0.293918 0.167935 0.027035 0.032836 0.001162
2 0.006532 0.124003 0.188896 0.164872 0.003926
3 0.246152 0.072217 0.357974 0.016832 0.121802
4 0.075833 0.017246 0.928035 0.111811 0.025898
[5 rows x 128 columns]
predictr.predict(test_df).mean()
0.0290846877673225
y = test_labels
yhat = predictr.predict(test_df)
# Evaluation helpers (sklearn metrics)
import sklearn
from sklearn import metrics

def evaluation(y, yhat):
    # Collect several scores into a one-row DataFrame
    metrics = [sklearn.metrics.accuracy_score,
               sklearn.metrics.precision_score,
               sklearn.metrics.recall_score,
               sklearn.metrics.f1_score,
               sklearn.metrics.roc_auc_score]
    return pd.DataFrame({m.__name__: [m(y, yhat).round(6)] for m in metrics})
evaluation(y,yhat)
accuracy_score | precision_score | recall_score | f1_score | roc_auc_score | |
---|---|---|---|---|---|
0 | 0.512404 | 0.691176 | 0.040309 | 0.076175 | 0.511195 |
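Accuracy barely beats chance on this roughly balanced test split, and recall is only about 4%: the predictor flags almost nothing as fraud, which matches predict(test_df).mean() ≈ 0.029 above. A confusion matrix (a quick sketch, not in the original) makes the failure mode explicit:

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y, yhat))  # rows = true class, columns = predicted class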
# from sklearn.ensemble import RandomForestClassifier
# from sklearn import metrics
# classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
# for cl in classes:
# embeddings_train = cl(keyed_vectors=model_train.wv)
# train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
# test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]
# rf = RandomForestClassifier(n_estimators=1000, random_state=42)
# rf.fit(train_embeddings, train_labels);
# #X=train_embeddings
# #y=train_labels
# #df=[X,y]
# # predictr = TabularPredictor(label='train_labels')
# # predictr.fit(df)
# y_pred = rf.predict(test_embeddings)
# print(cl)
# print('Precision:', metrics.precision_score(test_labels, y_pred))
# print('Recall:', metrics.recall_score(test_labels, y_pred))
# print('F1-Score:', metrics.f1_score(test_labels, y_pred))